Explore different perspectives and approaches to create more truthful and enlightening data visualisation
The main purpose of this assignment is to evaluate the take-home exercise 1 from the perspective of clarity and aesthetics, and to refine the diagram based on the principles we learned in the previous two lessons. The dataset of this project is VAST Challenge 2022. The operation was carried out on Rstudio and main packages used are tidyverse and ggplot2 extensions.
We will do the data preparation in a similar procedure of the original creation:
Installing and loading the required libraries
Importing the dataset
# read csv file
participants <- read_csv('data/Participants.csv')
Data Wrangling
# rename 'HighSchoolOrCollege'
participants$educationLevel <- sub('HighSchoolOrCollege',
'High School or College',
participants$educationLevel)
# rename columns
participants <- participants %>%
rename('ParticipantID' = 'participantId',
'HouseholdSize' = 'householdSize',
'HaveKids' = 'haveKids',
'Age' = 'age',
'EducationLevel' = 'educationLevel',
'InterestGroup' = 'interestGroup',
'Joviality' = 'joviality')
# Age variable is binned with the following code chunk:
brks <- c(17, 20, 25, 30, 35, 40, 45, 50, 55, 60)
grps <- c('20 & Below', '21-25', '26-30', '31-35', '36-40', '41-45','46-50', '51-55', '56-60')
participants$AgeGroup <- cut(participants$Age, breaks=brks, labels = grps)
A histogram is used to reveal the distribution of residents’ age.
knitr::include_graphics("C:/KatherineHuan/ISSS608/th_ex/data/pic/th2_or1.PNG")
Clarity
Aesthetic
library(plotly)
# Bar chart plot
Age <- ggplot(data= participants,
aes(x= AgeGroup)) +
geom_bar(fill= 'light blue') +
ylim(0, 150) +
# annotation
geom_text(stat = 'count',
aes(label= paste0(round(stat(count)/sum(stat(count))*100,
1), '%')), vjust= -0.5, size= 2.5) +
labs(y= 'No. of\nResidents', x= 'AgeGroup',
title = "Distribution of Residents' Age",
subtitle= 'Demographics in Engagement, Ohio',
caption = "Source: VAST Challenge 2022") +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.title.y= element_text(hjust = 0.5),
axis.ticks.x= element_blank(),
panel.background= element_blank(),
panel.grid.major = element_line(size= 0.2, color = "grey"),
axis.line= element_line(color= 'grey'),
plot.caption = element_text(hjust = 0))
ggplotly(Age)
Clarity
Aesthetic
Line chart is used to reveal the distribution of residents’ joviality according to their age and education level.
knitr::include_graphics("C:/KatherineHuan/ISSS608/th_ex/data/pic/th2_or2.PNG")
Clarity
Aesthetic
a. Joviality in relation to Age and Education Level
# Plot Joviality in relation to Age and Education Level
p1 <- ggplot(participants,
aes(x= AgeGroup, y= Joviality)) +
geom_violin(fill= '#66cdaa',
scale = 'count',alpha = .9, trim = FALSE,
color= NA) +
geom_boxplot(width= 0.2,
color = '#065535',
alpha= 0.8) +
stat_summary(aes(color= 'Mean'),
fun= 'mean',
size= 0.05) +
geom_hline(aes(yintercept = 0.5),
color= 'black',
linetype= 'dashed',
size= .6) +
ylim(-0.3,1.5) +
# Annotation
scale_color_manual(name= 'Statistics',
values = (Mean= '#f6546a')) +
# Add on Education Level Factor
facet_grid(~factor(EducationLevel)) +
labs(title= 'Joviality Distribution in Relation to Age',
x= 'Age Group', y= 'Joviality',
subtitle= 'Demographics in Engagement, Ohio',
caption = "Source: VAST Challenge 2022") +
theme(panel.background = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.title.y = element_text(angle=0,vjust = 0.5),
axis.ticks.x = element_blank(),
axis.line= element_line(color= 'grey'),
axis.text.x = element_text(size=8,angle=90),
panel.grid.major.y = element_line(color= 'grey', size = 0.1),
plot.caption = element_text(hjust=0),
legend.key = element_rect(fill= NA))
p1
To detect the relationship between every two variables, we will plot another 3 charts for better understand how is the incluence in between.
b. Joviality in relation to Age
# Plot Joviality in relation to Age
p2 <- ggplot(participants,
aes(x= AgeGroup, y= Joviality)) +
geom_violin(fill= '#66cdaa',
scale = 'count',alpha = .9, trim = FALSE,
color= NA) +
geom_boxplot(width= 0.2,
color = '#065535',
alpha= 0.8) +
stat_summary(aes(color= 'Mean'),
fun= 'mean',
size= 0.2) +
geom_hline(aes(yintercept = 0.5),
color= 'black',
linetype= 'dashed',
size= .6) +
# Annotation
scale_color_manual(name= 'Statistics',
values = (Mean= '#f6546a')) +
labs(title= 'Joviality Distribution in Relation to Age',
x= 'Age Group', y= 'Joviality',
subtitle= 'Demographics in Engagement, Ohio',
caption = "Source: VAST Challenge 2022") +
theme(panel.background = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.title.y = element_text(angle=0,vjust = 0.5),
axis.ticks.x = element_blank(),
axis.line= element_line(color= 'grey'),
axis.text.x = element_text(size=8,angle=0),
panel.grid.major.y = element_line(color= 'grey', size = 0.1),
plot.caption = element_text(hjust=0),
legend.key = element_rect(fill= NA))
p2
c. Joviality in relation to Education Level
# Plot Joviality in relation to Education Level
p3 <- ggplot(participants,
aes(x= EducationLevel, y= Joviality)) +
geom_violin(fill= '#66cdaa',
scale = 'count',alpha = .9, trim = FALSE,
color= NA) +
geom_boxplot(width= 0.08,
color = '#065535',
alpha= 0.8) +
stat_summary(aes(color= 'Mean'),
fun= 'mean',
size= 0.2) +
geom_hline(aes(yintercept = 0.5),
color= 'black',
linetype= 'dashed',
size= .6) +
# Annotation
scale_color_manual(name= 'Statistics',
values = (Mean= '#f6546a')) +
# Add on Education Level Factor
#facet_grid(~factor(AgeGroup)) +
labs(title= 'Joviality Distribution in Relation to Education Level',
x= 'Education Level', y= 'Joviality',
subtitle= 'Demographics in Engagement, Ohio',
caption = "Source: VAST Challenge 2022") +
theme(panel.background = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.title.y = element_text(angle=0,vjust = 0.5),
axis.ticks.x = element_blank(),
axis.line= element_line(color= 'grey'),
axis.text.x = element_text(size=8,angle=0),
panel.grid.major.y = element_line(color= 'grey', size = 0.1),
plot.caption = element_text(hjust=0),
legend.key = element_rect(fill= NA))
p3
d. Education Level in relation to Age
# Data manipulation
percent <-
participants %>%
group_by(AgeGroup, EducationLevel) %>%
summarise(edu_size= n()) %>%
mutate(edu_pct= percent(edu_size/sum(edu_size)))
#Plot Education Level in relation to Age
p4 <- ggplot(data= percent,
aes(x= factor(AgeGroup),
y= edu_pct,
group= EducationLevel,
color= factor(EducationLevel))) +
geom_line() +
scale_color_discrete(name= 'Education Level') +
scale_y_continuous(labels = percent_format(),
expand = c(0.2, 0.2)) +
labs(x= 'Age Group',y= 'Propotions',
title = "Composition of Residents' Education Level in Age Groups")+
theme(legend.position = 'top', legend.direction = 'horizontal',
axis.title.y= element_text(angle=90), axis.ticks.x= element_blank(),
panel.background= element_blank(), axis.line= element_line(color= 'grey'),
panel.grid.major = element_line(size= 0.2, color = "grey"),
legend.key = element_rect(fill= NA), legend.title = element_text(size = 8.5),
plot.title = element_text(hjust=0.5))
p4
Clarity
Aesthetic
There are more than one way to present the data.It’s not that the more complex the graph, the better to show the ability to make the graph, we should avoid pack too much information into one graph, but use simple and clear graph to make the audience understand our point of view more easily.
Derived values reveal more interesting patterns than absolute values.For example, the proportion of each age group. So, pre-attentive processing is important for design of visualizations such as the calculation of proportion of education level.